Supp Material for Non-Luminous Ostracod *Skogsbergia sp.*
The R Juypter Notebook serves as a comprehensive repository encompasing most of the scripts and figures relevant to the non-bioluminescent ostracod Skogsbergia sp. analyses outlined in the publication. This notebook includes the following analyses: QC steps, Differential Gene Expression (DGE), GO enrichments, Cross-Species DGE analysis, WGCNA and conservation of BCN orthologs.
Author: Lisa Yeter Mesrop
#load libraries
library(tidyverse)
library(edgeR)
library(matrixStats)
library(DESeq2)
library(dplyr)
library(readxl)
library(data.table)
library(ggplot2)
library(WGCNA)
library(VennDiagram)
library(purrr)
library(reshape2)
library(knitr)
library(RColorBrewer)
library(pheatmap)
library(topGO)
library(ggvenn)
#always use the following WGCNA functions
options(stringsAsFactors = FALSE);
enableWGCNAThreads();
allowWGCNAThreads(nThreads = 22)
Import Skogsbergia sp. gene expression matrix which consists of three tissue types - upper lip, compound eye and gut - with five biological replicates for each tissue. Generate the sample name sheet (meta sheet) for downstream DESeq2 analyses. Read in the annotation file for Skogsbergia sp. generated by Trinotate.
#read in gene expression matrix
skogs_counts <- read.delim("skogs_fasta90_isoform_combined.tab", header = TRUE, sep = "\t", quote = "")
head(skogs_counts)
#fix the column names
row.names(skogs_counts) <-skogs_counts$X
head(skogs_counts)
#remove the column X and extra X.1
skogs_counts$X.1 <- NULL
skogs_counts$X <- NULL
head(skogs_counts)
meta <- data.frame(row.names = colnames(skogs_counts))
head(meta)
sample_name = c("Upper_lip", "Eye", "Gut", "Upper_lip", "Eye", "Gut", "Upper_lip", "Eye", "Gut", "Upper_lip", "Eye", "Gut","Upper_lip", "Eye", "Gut")
meta$sample_name <- sample_name
meta$names <- rownames(meta)
rownames(meta) <- NULL
meta
#import count table, meta and sample_name into a DESeq2 object
dds_count_table <- DESeqDataSetFromMatrix(countData = skogs_counts, colData = meta, design = ~sample_name)
nrow(counts(dds_count_table))
#the number of prefiltered counts for each sample
colSums(assay(dds_count_table))
The counts of raw reads were examined using a bar plot.
#visualize prefiltered raw counts in a barplot
librarySizes <- colSums(assay(dds_count_table))
par(mar=c(10,5,2,2))
barplot(librarySizes,
las=2,
cex.names=.5,
main="Barplot of raw count distributions of samples")
# run DESeq2 function and normalization
dds_raw_counts <- DESeq(dds_count_table, betaPrior = FALSE, parallel = TRUE)
# Perform a variance-stabilizing transformation
vsd_raw_counts <- varianceStabilizingTransformation(dds_raw_counts)
sampleDists_raw_counts <- dist(t(assay(vsd_raw_counts)))
#plot the heatmap
sampleDists_raw_counts_Matrix <- as.matrix(sampleDists_raw_counts)
rownames(sampleDists_raw_counts_Matrix) <- paste(colData(dds_raw_counts)$sample_name)
colnames(sampleDists_raw_counts_Matrix) <- colData(dds_raw_counts)$names
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDists_raw_counts_Matrix,
clustering_distance_rows=sampleDists_raw_counts,
clustering_distance_cols=sampleDists_raw_counts,
col=colors)
#plot the PCA
pcaData_raw_counts <- plotPCA(vsd_raw_counts, intgroup="sample_name", returnData=TRUE)
percentVar_raw_counts <- round(100 * attr(pcaData_raw_counts, "percentVar"))
ggplot(pcaData_raw_counts, aes(PC1, PC2, color=sample_name, shape=sample_name)) +
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar_raw_counts[1],"% variance")) +
ylab(paste0("PC2: ",percentVar_raw_counts[2],"% variance")) +
geom_point(size=5) +
theme(panel.grid.major = element_line(colour = "gray97", size = 1), panel.grid.minor = element_line(linetype = "dotted"), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100")) + theme(axis.line = element_line(size = 0.5,linetype = "solid")) +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
coord_fixed() +
scale_color_manual(values = c('#F2C93D','#C97D97','#F1AFB4'))+theme(
legend.title = element_text(size = 16),
legend.text = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 12)
)
options(repr.plot.width=8, repr.plot.height=6, repr.plot.res = 150)
#read in the Trinotate sheet for Skogsbergia sp.
Trinotate_lym_subset_skogs <- read.csv(file = "Trinotate_lym_subset_Skogsbergia_cdhit90_longestisoform.csv")
As a preliminary quality control (QC) measure for WGCNA analysis, the overall similarity between samples and transcripts with low counts was assessed, as these counts often introduce noise in co-expression analyses. A filter was applied to the expression matrix, removing transcripts with fewer than 5 counts in more than 5 samples, given that some sample types included a minimum of 5 biological replicates. The QC analyses utilized functions from the DESeq2 package (Love et al., 2014). The WGCNA analysis is used for construction of co-expression networks for Skogsbergia sp. and conservation of BCN orthologs in the networks of the non-luminous Skogsbergia sp. in Section 7.
#import count table into a DESeq2
dds_count_table_wgcna <- DESeqDataSetFromMatrix(countData = skogs_counts, colData = meta, design = ~sample_name)
#filter the count table
dds_merged_table_prefiltered_wgcna <- dds_count_table_wgcna[rowSums(counts(dds_count_table_wgcna) >= 5) >=5,];
nrow(dds_merged_table_prefiltered_wgcna
# run DESeq2 function and normalization
dds_prefiltered_wgcna <- DESeq(dds_merged_table_prefiltered_wgcna, betaPrior = FALSE, parallel = TRUE) ;#make sure that these are default parameters, betaPrior and parallel
# Perform a variance-stabilizing transformation
vsd_prefiltered_wgcna <- varianceStabilizingTransformation(dds_prefiltered_wgcna)
#transpose the matrix
sampleDists_wgcna <- dist(t(assay(vsd_prefiltered_wgcna)))
#plot the heatmap
sampleDistMatrix_wgcna <- as.matrix(sampleDists_wgcna)
rownames(sampleDistMatrix_wgcna) <- paste(colData(dds_merged_table_prefiltered_wgcna)$sample_name)
colnames(sampleDistMatrix_wgcna) <- colData(dds_merged_table_prefiltered_wgcna)$names
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix_wgcna,
clustering_distance_rows=sampleDists_wgcna,
clustering_distance_cols=sampleDists_wgcna,
col=colors)
#plot the PCA
pcaData_prefiltered_wgcna <- plotPCA(vsd_prefiltered_wgcna, intgroup="sample_name", returnData=TRUE)
percentVar_prefiltered_wgcna <- round(100 * attr(pcaData_prefiltered_wgcna, "percentVar"))
ggplot(pcaData_prefiltered_wgcna, aes(PC1, PC2, color=sample_name, shape=sample_name)) +
geom_point(size=5) +
theme(panel.grid.major = element_line(colour = "gray97", size = 1), panel.grid.minor = element_line(linetype = "dotted"), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100")) + theme(axis.line = element_line(size = 0.5,linetype = "solid")) +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
scale_color_manual(values = c('#C24C3D','#8E3DC2','#E69F00'))+
xlab(paste0("PC1: ",percentVar_raw_counts[1],"% variance")) +
ylab(paste0("PC2: ",percentVar_raw_counts[2],"% variance")) +
coord_fixed()
Differential gene expression analysis was carried out in DESeq2 (Love et al., 2014). For Skogsbergia sp. , we determined differentially upregulated genes in three tissue types - upper lip, compound eye and gut - using five biological replicates for each tissue. DESeq2 was employed using a p-value < 0.05 and FC > 1.5 for the significance of differentially expressed genes using the Benjamini-Hochberg method to account for false discovery rate (FDR). Pairwise comparisons were done across tissue types (i.e., upper lip to compound eye, upper lip to gut, gut to compound eye). To identify tissue-specific differential expression (i.e. significantly upregulated genes that are uniquely expressed), each tissue was compared to the other two. For example, the expression in the upper lip was determined by comparing it to both the compound eye and the gut tissues. For each pairwise comparison, the reference tissue was specified to determine the significantly upregulated genes in each tissue type (i.e., positive vs negative logfold change).
dds_count_table
#run DESeq2 analysis
dds_DE <- DESeq(dds_count_table)
#set eye as a reference tissue
dds_DE$sample_name <- relevel(dds_DE$sample_name, ref= "Eye")
#rerun DESeq command after reference is specified
dds_DE <- DESeq(dds_DE)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_UpperlipVs_Eye <- results(dds_DE, contrast= c("sample_name", "Upper_lip", "Eye") , alpha = 0.05)
res_tableOE_UpperlipVs_Eye <- lfcShrink(dds_DE, contrast= c("sample_name", "Upper_lip", "Eye"), res = res_tableOE_unshrunken_UpperlipVs_Eye)
mcols(res_tableOE_UpperlipVs_Eye, use.names=T)
# set thresholds
# lfc.cutoff value of 0.58 translates to a 1.5 log2 fold change
# padj.cutoff value of 0.05
padj.cutoff <- 0.05
lfc.cutoff <- 0.58
res_tableOE_UpperlipVs_Eye_tb <- res_tableOE_UpperlipVs_Eye %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#determine all differentially expressed genes
sigOE_UpperlipVs_Eye <- res_tableOE_UpperlipVs_Eye_tb %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
colnames(sigOE_UpperlipVs_Eye)[1]<- "transcript_id"
head(sigOE_UpperlipVs_Eye)
#extract all genes that are significantly upregulated in the Upper Lip (positive log2 fold change)
Upperlip_Vs_eye_sigOE_UPREGULATED_logfold <- sigOE_UpperlipVs_Eye %>%
filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
colnames(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)[1]<- "transcript_id"
#add the annotation
Upperlip_Vs_eye_sigOE_UPREGULATED_logfold_annot <- setDT(Trinotate_lym_subset_skogs, key = 'transcript_id')[J(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)]
#genes that are significantly upregulated in the Upper Lip (positive log2 fold change)
head(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold_annot)
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the upper lip.
Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold <- sigOE_UpperlipVs_Eye %>%
filter(padj < padj.cutoff & log2FoldChange < lfc.cutoff)
#save these two dataframes for downstream analysis in Section 4.4
#genes that are significantly upregulated in the Upper Lip (positive log2 fold change)
head(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)
#genes that significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the upper lip.
head(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_GutVsEye <- results(dds_DE, contrast= c("sample_name", "Gut", "Eye") , alpha = 0.05)
res_tableOE_GutVsEye <- lfcShrink(dds_DE, contrast= c("sample_name", "Gut", "Eye"), res = res_tableOE_unshrunken_GutVsEye)
mcols(res_tableOE_GutVsEye, use.names=T)
res_tableOE_tb_GutVsEye <- res_tableOE_GutVsEye %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#determine all differentially expressed genes
sigOE_GutVsEye <- res_tableOE_tb_GutVsEye %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
colnames(sigOE_GutVsEye)[1]<- "transcript_id"
#extract all genes that are significantly upregulated in the gut (positive log2 fold change)
GutVsEye_sigOE_UPREGULATED_logfold <- sigOE_GutVsEye %>%
filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
#match the nodes
GutVsEye_sigOE_UPREGULATED_logfold_annot <- setDT(Trinotate_lym_subset_skogs, key = 'transcript_id')[J(GutVsEye_sigOE_UPREGULATED_logfold)]
#genes that are significantly upregulated in the gut (positive log2 fold change)
head(GutVsEye_sigOE_UPREGULATED_logfold_annot)
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the gut.
#extract all genes that are significantly upregulated in the Compound Eye
GutVsEye_sigOE_DOWNREGULATED_logfold <- sigOE_GutVsEye %>%
filter(padj < padj.cutoff & log2FoldChange < lfc.cutoff)
#save these two dataframes for downstream analysis
#genes that are significantly upregulated in the Gut (positive log2 fold change)
head(GutVsEye_sigOE_UPREGULATED_logfold)
#genes that are significantly upregulated in the Compound Eye ((negative log2fold change) but that are downregulated in the gut
head(GutVsEye_sigOE_DOWNREGULATED_logfold)
#now set gut as a reference tissue
dds_DE$sample_name <- relevel(dds_DE$sample_name, ref= "Gut")
#rerun DESeq
dds_DE <- DESeq(dds_DE)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_UpperLipVsGut <- results(dds_DE, contrast= c("sample_name", "Upper_lip", "Gut") , alpha = 0.05)
res_tableOE_UpperLipVsGut <- lfcShrink(dds_DE, contrast= c("sample_name", "Upper_lip", "Gut"), res = res_tableOE_unshrunken_UpperLipVsGut)
mcols(res_tableOE_UpperLipVsGut, use.names=T)
res_tableOE_tb_UpperLipVsGut <- res_tableOE_UpperLipVsGut %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#determine all differentially expressed genes
sigOE_UpperLipVsGut <- res_tableOE_tb_UpperLipVsGut %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the upper lip (positive log2 fold change)
UpperLipVsGut_sigOE_UPREGULATED_logfold <- sigOE_UpperLipVsGut %>%
filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
#genes that are significantly upregulated in the upper lip (positive log2 fold change)
head(UpperLipVsGut_sigOE_UPREGULATED_logfold )
#extract all genes that are significantly upregulated in the Gut (negative log2fold change)
UpperLipVsGut_sigOE_DOWNREGULATED_logfold <- sigOE_UpperLipVsGut %>%
filter(padj < padj.cutoff & log2FoldChange < lfc.cutoff)
#extract all genes that are significantly upregulated in the Gut (negative log2fold change) but that are downregulated in the Upper Lip.
head(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
#save these two dataframes for downstream analysis
#genes that are significantly upregulated in the upper lip (positive log2 fold change)
head(UpperLipVsGut_sigOE_UPREGULATED_logfold)
#genes that are significantly upregulated in the gut (negative log2fold change) but that are downregulated in the Upper Lip
head(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
To identify tissue-specific differential expression (i.e., significantly upregulated genes that are uniquely expressed), each tissue was compared to the other two and tissue-specific genes were extracted from the intersection of the Venn diagram (in Section 4.4.4). For example, the expression in the upper lip was determined by comparing it to both the compound eye and gut tissues.
#create a dataframe with all significantly upregulated genes of the upper lip
#merge dataframes that have significantly upregulated genes of the upper lip from pairwise comparisons - Upper Lip vs Gut and Upper Lip vs Eye
head(UpperLipVsGut_sigOE_UPREGULATED_logfold)
head(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)
colnames(UpperLipVsGut_sigOE_UPREGULATED_logfold)[1]<- "gene"
colnames(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)[1]<- "gene"
#merge
merged_upper_lips_df <- rbind(
UpperLipVsGut_sigOE_UPREGULATED_logfold,
Upperlip_Vs_eye_sigOE_UPREGULATED_logfold
)
head(merged_upper_lips_df)
#the same gene can be found in both Upper Lip vs Gut and Upper Lip vs Eye. Remove gene duplicates while retaining one duplicate.
merged_upper_lips_unique <- merged_upper_lips_df[!duplicated(merged_upper_lips_df$gene), ]
head(merged_upper_lips_unique)
#create a dataframe with all significantly upregulated genes of the compound eye
#merge dataframes that have significantly upregulated genes of the compound eye from pairwise comparisons - Upper Lip vs Compound Eye and Gut vs Compound Eye
#genes that significantly upregulated in the Compound Eye
head(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold)
#genes that are significantly upregulated in the Compound Eye
head(GutVsEye_sigOE_DOWNREGULATED_logfold)
colnames(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold)[1]<- "gene"
colnames(GutVsEye_sigOE_DOWNREGULATED_logfold)[1]<- "gene"
#merged eye
merged_Eye_df <- rbind(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold , GutVsEye_sigOE_DOWNREGULATED_logfold)
head(merged_Eye_df)
#the same gene can be found in both Upper Lip vs Compound Eye and Gut vs Compound Eye. Remove gene duplicates while retaining one duplicate.
merged_Eye_unique <-merged_Eye_df[!duplicated(merged_Eye_df$gene), ]
head(merged_Eye_unique)
#create a dataframe with all significantly upregulated genes of the gut
#merge dataframes that have significantly upregulated genes of the gut from pairwise comparisons - Gut vs Eye and Upper lip vs Gut
#genes that are significantly upregulated in the Gut
head(GutVsEye_sigOE_UPREGULATED_logfold)
#genes that are significantly upregulated in the gut
head(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
colnames(GutVsEye_sigOE_UPREGULATED_logfold)[1]<- "gene"
colnames(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)[1]<- "gene"
#merge
merged_Gut_df <- rbind(GutVsEye_sigOE_UPREGULATED_logfold , UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
head(merged_Gut_df)
#the same gene can be found in both Gut vs Eye and Upper lip vs Gut. Remove gene duplicates while retaining one duplicate.
merged_Gut_unique <-merged_Gut_df[!duplicated(merged_Gut_df$gene), ]
head(merged_Gut_unique)
#generate a venn diagram to visualize shared significantly upregulated genes across tissue types and extract the genes that are unique to each tissue type.
unique_venn_list <- list(
Upper_Lip = merged_upper_lips_unique$gene ,
Gut = merged_Gut_unique$gene,
Compound_Eye = merged_Eye_unique$gene
)
ggvenn_unique <- ggvenn(
unique_venn_list,
fill_color = c('#F1AFB4','#C97D97', '#F2C93D'),
stroke_size = .7, set_name_size = 6, text_size = 5
)
ggvenn_unique
# Open a PDF device
pdf("Non_Luminous_DGE.pdf", width = 8, height = 6)
ggvenn_unique
dev.off()
#prep dataframes for extraction
Upper_Lip <- as.data.frame(merged_upper_lips_unique$gene)
colnames(Upper_Lip)[1]<- "gene"
Gut <- as.data.frame(merged_Gut_unique$gene)
colnames(Gut)[1]<- "gene"
Compound_eye <- as.data.frame(merged_Eye_unique$gene)
colnames(Compound_eye)[1]<- "gene"
# compare and extract unique genes for each tissue type
unique_genes_upper_lip <- anti_join(Upper_Lip, Gut, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_gut <- anti_join(Gut, Upper_Lip, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_compound_eye <- anti_join(Compound_eye, Upper_Lip, by = "gene") %>%
anti_join(Gut, by = "gene")
nrow(unique_genes_upper_lip)
nrow(unique_genes_gut)
nrow(unique_genes_compound_eye)
#add the annotations back to the unique genes in each tissue type by subsetting
unique_genes_upper_lip_info <- merged_upper_lips_unique %>%
filter(gene %in% unique_genes_upper_lip$gene)
nrow(unique_genes_upper_lip_info)
unique_genes_eye_info <- merged_Eye_unique %>%
filter(gene %in% unique_genes_compound_eye$gene)
nrow(unique_genes_eye_info)
unique_genes_gut_info <- merged_Gut_unique %>%
filter(gene %in% unique_genes_gut$gene)
nrow(unique_genes_gut_info)
#add annotations back for upper lip
colnames(unique_genes_upper_lip_info)[1]<- "transcript_id"
unique_genes_upper_lip_info_annot <- left_join(unique_genes_upper_lip_info,Trinotate_lym_subset_skogs,by="transcript_id")
#write.csv(unique_genes_upper_lip_info_annot, file = "df_Skogs_sigfig_upreg_unique_Upper_Lip.csv")
#add annotations back for eyes
colnames(unique_genes_eye_info)[1]<- "transcript_id"
unique_genes_eye_info_annot <- left_join(unique_genes_eye_info,Trinotate_lym_subset_skogs,by="transcript_id")
#write.csv(unique_genes_eye_info_annot, file = "df_Skogs_sigfig_upreg_unique_comEye.csv")
#add annotations back for gut
colnames(unique_genes_gut_info)[1] <- "transcript_id"
unique_genes_gut_info_annot <- left_join(unique_genes_gut_info,Trinotate_lym_subset_skogs,by="transcript_id")
#write.csv(unique_genes_gut_info_annot, file = "df_Skogs_sigfig_upreg_unique_Gut.csv")
Compared the number of significantly upregulated genes (expressed uniquely) between upper lips and compound eyes of luminous and non-luminous ostracods.
#import V.tsujii compound eyes
vtsujii_eye <- read.csv("df_Vtsujii_sigfig_upreg_unique_comEye.csv", header = TRUE, row.names=1,
stringsAsFactors = FALSE)
#change column name
colnames(vtsujii_eye)[1]<- "gene"
vargulatsujii_v_skogsbergia_orthologs_factor <- read.csv("Vargula_tsujii_cdhit_95.fasta.transdecoder__v__Skogsbergia_sp.csv")
head(vargulatsujii_v_skogsbergia_orthologs_factor)
#find the orthogroups that have V.tsujii DE compound eye transcripts
vtsujii_DE_eye <- subset(vtsujii_eye, select = "gene")
head(vtsujii_DE_eye)
# run the function and change the column names accordingly
check_and_subset <- function(df1, df2) {
matched_rows_list <- list()
for (i in 1:nrow(df1)) {
char_row <- df1[i, "gene"]
matched_rows <- df2[str_detect(df2$Vargula_tsujii_cdhit_95.fasta.transdecoder, paste0(char_row, collapse = "|")), , drop = FALSE]
if (nrow(matched_rows) > 0) {
matched_rows_list[[i]] <- matched_rows
}
}
if (length(matched_rows_list) > 0) {
matched_df <- bind_rows(matched_rows_list)
return(matched_df)
} else {
return(NULL)
}
}
# Use the function
vtsujii_DE_eye_orthogroups <- check_and_subset(vtsujii_DE_eye,
vargulatsujii_v_skogsbergia_orthologs_factor)
head(vtsujii_DE_eye_orthogroups)
vtsujii_DE_eye_orthogroups_numbers <- subset(vtsujii_DE_eye_orthogroups, select = "Orthogroup")
head(vtsujii_DE_eye_orthogroups_numbers)
vtsujii_DE_eye_orthogroups_numbers_rmdup <- vtsujii_DE_eye_orthogroups_numbers %>% distinct()
nrow(vtsujii_DE_eye_orthogroups_numbers_rmdup)
skogs_DE_eye <- subset(unique_genes_compound_eye, select = "gene")
colnames(skogs_DE_eye)[1]<- "transcript_id"
head(skogs_DE_eye)
# use this function and change the column names
check_and_subset <- function(df1, df2) {
matched_rows_list <- list()
for (i in 1:nrow(df1)) {
char_row <- df1[i, "transcript_id"]
matched_rows <- df2[str_detect(df2$Skogsbergia_sp, paste0(char_row, collapse = "|")), , drop = FALSE]
if (nrow(matched_rows) > 0) {
matched_rows_list[[i]] <- matched_rows
}
}
if (length(matched_rows_list) > 0) {
matched_df <- bind_rows(matched_rows_list)
return(matched_df)
} else {
return(NULL)
}
}
# Use the function
skogs_DE_eye_orthogroups <- check_and_subset(skogs_DE_eye, vargulatsujii_v_skogsbergia_orthologs_factor)
head(skogs_DE_eye_orthogroups)
skogs_DE_eye_orthogroups_numbers <- subset(skogs_DE_eye_orthogroups, select = "Orthogroup" )
skogs_DE_eye_orthogroups_numbers_rmdup <- skogs_DE_eye_orthogroups_numbers %>% distinct()
nrow(skogs_DE_eye_orthogroups_numbers_rmdup)
head(vtsujii_DE_eye_orthogroups_numbers_rmdup$Orthogroup)
head(skogs_DE_eye_orthogroups_numbers_rmdup$Orthogroup)
#convert columns to characters for the function below
vtsujii_DE_eye_orthogroups_numbers_rmdup <- vtsujii_DE_eye_orthogroups_numbers_rmdup %>% mutate(Orthogroup = as.character(Orthogroup))
skogs_DE_eye_orthogroups_numbers_rmdup <- skogs_DE_eye_orthogroups_numbers_rmdup %>% mutate(Orthogroup = as.character(Orthogroup))
#start with the longer list first which is V.tsujii
vtsujii_skogs_orthogroups_numbers_match <- vtsujii_DE_eye_orthogroups_numbers_rmdup %>%
mutate(match = c("no", "yes")[1 + (rowSums(
outer(
strsplit(Orthogroup, "\\s+"),
strsplit(skogs_DE_eye_orthogroups_numbers_rmdup$Orthogroup, "\\s+"),
Vectorize(function(x, y) all(x %in% y) | all(y %in% x))
)
) > 0)])
head(vtsujii_skogs_orthogroups_numbers_match)
vtsujii_skogs_orthogroups_numbers_match_yes <- vtsujii_skogs_orthogroups_numbers_match %>% filter(match=="yes")
vtsujii_skogs_orthogroups_numbers_match_yes
#take the Orthogroup numbers
vtsujii_skogs_orthogroups_numbers_match_yes_ch <- as.character(vtsujii_skogs_orthogroups_numbers_match_yes$Orthogroup)
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES <- vargulatsujii_v_skogsbergia_orthologs_factor %>% filter(Orthogroup %in% vtsujii_skogs_orthogroups_numbers_match_yes_ch)
head(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES)
#now extract all transcripts
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids <- as.character(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES$Vargula_tsujii_cdhit_95.fasta.transdecoder)
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids_unlist <- unlist(strsplit(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids,","))
#remove p*
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids_unlist_removep <- vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids_unlist %>% str_replace("(.p1)", "") %>%
str_replace("(.p2)", "")
#found out there is a space in front of certain transcripts
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids_unlist_removep_removespace <- trimws(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids_unlist_removep)
vtsujii_eye_orthogroups_transcript_ids <- vtsujii_eye %>% filter (gene %in% vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_transcript_ids_unlist_removep_removespace)
head(vtsujii_eye_orthogroups_transcript_ids)
nrow(vtsujii_eye_orthogroups_transcript_ids )
#now extract all transcripts
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids <- as.character(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES$Skogsbergia_sp)
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids_unlist <- unlist(strsplit(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids,","))
#remove p*
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids_unlist_removep <- vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids_unlist %>% str_replace("(.p1)", "") %>%
str_replace("(.p2)", "")
#found out there is a space in front of certain transcripts
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids_unlist_removep_removespace <- trimws(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids_unlist_removep)
skogs_eye_orthogroups_transcript_ids <- skogs_DE_eye %>% filter (transcript_id %in% vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_EYES_skogs_transcript_ids_unlist_removep_removespace)
skogs_eye_orthogroups_transcript_ids
nrow(skogs_eye_orthogroups_transcript_ids)
#import V.tsujii upper lip
vtsujii_bio_upper_lip <- read.csv("df_Vtsujii_sigfig_upreg_unique_BioUpperLip.csv", header = TRUE, row.names=1,
stringsAsFactors = FALSE)
#use the already imported sheet all orthologs from Section 5.1.1
head(vargulatsujii_v_skogsbergia_orthologs_factor)
vtsujii_bio_upper_lip_ids <- subset(vtsujii_bio_upper_lip, select = "transcript_id")
# make sure to change the column names
check_and_subset <- function(df1, df2) {
matched_rows_list <- list()
for (i in 1:nrow(df1)) {
char_row <- df1[i, "transcript_id"]
matched_rows <- df2[str_detect(df2$Vargula_tsujii_cdhit_95.fasta.transdecoder, paste0(char_row, collapse = "|")), , drop = FALSE]
if (nrow(matched_rows) > 0) {
matched_rows_list[[i]] <- matched_rows
}
}
if (length(matched_rows_list) > 0) {
matched_df <- bind_rows(matched_rows_list)
return(matched_df)
} else {
return(NULL)
}
}
# use the function
vtsujii_bio_upper_lip_orthogroups <- check_and_subset(vtsujii_bio_upper_lip_ids,
vargulatsujii_v_skogsbergia_orthologs_factor)
head(vtsujii_bio_upper_lip_orthogroups)
#extract all the orthogroups
vtsujii_bio_upper_lip_orthogroups_numbers <- subset(vtsujii_bio_upper_lip_orthogroups, select = "Orthogroup")
head(vtsujii_bio_upper_lip_orthogroups_numbers)
vtsujii_bio_upper_lip_orthogroups_numbers_rmdup <- vtsujii_bio_upper_lip_orthogroups_numbers %>% distinct()
nrow(vtsujii_bio_upper_lip_orthogroups_numbers_rmdup)
#significantly upregulated genes (uniquely expressed) in the upper lip from Section 4.4.4
head(unique_genes_upper_lip_info)
colnames(unique_genes_upper_lip_info)[1] <- "transcript_id"
skogs_upper_lip_ids <- subset(unique_genes_upper_lip_info, select = "transcript_id")
# use this function and change the column names
check_and_subset <- function(df1, df2) {
matched_rows_list <- list()
for (i in 1:nrow(df1)) {
char_row <- df1[i, "transcript_id"]
matched_rows <- df2[str_detect(df2$Skogsbergia_sp, paste0(char_row, collapse = "|")), , drop = FALSE]
if (nrow(matched_rows) > 0) {
matched_rows_list[[i]] <- matched_rows
}
}
if (length(matched_rows_list) > 0) {
matched_df <- bind_rows(matched_rows_list)
return(matched_df)
} else {
return(NULL)
}
}
# use the function
skogs_upper_lip_orthogroups <- check_and_subset(skogs_upper_lip_ids, vargulatsujii_v_skogsbergia_orthologs_factor)
head(skogs_upper_lip_orthogroups)
skogs_upper_lip_orthogroups_numbers <- subset(skogs_upper_lip_orthogroups, select = "Orthogroup" )
skogs_upper_lip_orthogroups_numbers_rmdup <- skogs_upper_lip_orthogroups_numbers %>% distinct()
nrow(skogs_upper_lip_orthogroups_numbers_rmdup)
head(vtsujii_bio_upper_lip_orthogroups_numbers_rmdup$Orthogroup)
head(skogs_upper_lip_orthogroups_numbers_rmdup$Orthogroup)
#convert columns to characters for the function below
vtsujii_bio_upper_lip_orthogroups_numbers_rmdup_chr <- vtsujii_bio_upper_lip_orthogroups_numbers_rmdup %>% mutate(Orthogroup = as.character(Orthogroup))
skogs_upper_lip_orthogroups_numbers_rmdup_chr <- skogs_upper_lip_orthogroups_numbers_rmdup %>% mutate(Orthogroup = as.character(Orthogroup))
#start with the longer list first which is V.tsujii
vtsujii_skogs_upper_lips_orthogroups_numbers_match <- vtsujii_bio_upper_lip_orthogroups_numbers_rmdup_chr %>%
mutate(match = c("no", "yes")[1 + (rowSums(
outer(
strsplit(Orthogroup, "\\s+"),
strsplit(skogs_upper_lip_orthogroups_numbers_rmdup_chr$Orthogroup, "\\s+"),
Vectorize(function(x, y) all(x %in% y) | all(y %in% x))
)
) > 0)])
head(vtsujii_skogs_upper_lips_orthogroups_numbers_match)
vtsujii_skogs_upper_lips_orthogroups_numbers_match_yes <- vtsujii_skogs_upper_lips_orthogroups_numbers_match %>% filter(match=="yes")
vtsujii_skogs_upper_lips_orthogroups_numbers_match_yes
#take the Orthogroup numbers
vtsujii_skogs_upper_lips_orthogroups_numbers_match_yes_ch <- as.character(vtsujii_skogs_upper_lips_orthogroups_numbers_match_yes$Orthogroup)
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_UPPER_LIPS <- vargulatsujii_v_skogsbergia_orthologs_factor %>% filter(Orthogroup %in% vtsujii_skogs_upper_lips_orthogroups_numbers_match_yes_ch)
head(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_UPPER_LIPS)
#now extract all transcripts
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids <- as.character(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_UPPER_LIPS$Vargula_tsujii_cdhit_95.fasta.transdecoder)
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids_unlist <- unlist(strsplit(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids,","))
#remove p*
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids_unlist_removep <- vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids_unlist %>% str_replace("(.p1)", "") %>%
str_replace("(.p2)", "")
#remove extra space in front of transcript ids
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids_unlist_removep_trimws <- trimws(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids_unlist_removep)
vtsujii_bio_upper_lip_orthogroups_transcript_ids <- vtsujii_bio_upper_lip %>% filter (transcript_id %in% vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_BIO_UPPER_LIPS_transcript_ids_unlist_removep_trimws)
head(vtsujii_bio_upper_lip_orthogroups_transcript_ids)
nrow(vtsujii_bio_upper_lip_orthogroups_transcript_ids)
#write.csv(vtsujii_bio_upper_lip_orthogroups_transcript_ids, file = "vtsujii_bio_upper_lip_orthogroups_transcript_ids.csv")
shared_bio_upper_lip_transcript_ids <- vtsujii_bio_upper_lip_orthogroups_transcript_ids$transcript_id
shared_bio_upper_lip_transcript_ids
#now extract all transcripts
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids <- as.character(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_UPPER_LIPS$Skogsbergia_sp)
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids_unlist <- unlist(strsplit(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids,","))
#remove p*
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids_unlist_removep <- vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids_unlist %>% str_replace("(.p1)", "") %>%
str_replace("(.p2)", "")
#remove white spaces
vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids_unlist_removep_trimws <- trimws(vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids_unlist_removep)
skogs_upper_lip_orthogroups_transcript_ids <- unique_genes_upper_lip_info_annot %>% filter (transcript_id %in% vargulatsujii_v_skogsbergia_SHARED_ORTHOGROUPS_skogs_UPPER_LIP_transcript_ids_unlist_removep_trimws)
head(skogs_upper_lip_orthogroups_transcript_ids)
#write.csv(skogs_upper_lip_orthogroups_transcript_ids, file = "skogs_upper_lip_orthogroups_transcript_ids.csv")
nrow(skogs_upper_lip_orthogroups_transcript_ids)
shared_skogs_transcript_ids <- skogs_upper_lip_orthogroups_transcript_ids$transcript_id
shared_skogs_transcript_ids
#import the trinotate go sheet from Trinotate output
geneID2GO <- readMappings(file ="go_annotations_cdhit90_longestisoform.txt")
#save the transcript ids of all the annotated genes under geneNames object
geneNames<- as.character(Trinotate_lym_subset_skogs$transcript_id)
#significantly upregulated genes (i.e. expressed uniquely) in the upper lip
head(unique_genes_upper_lip_info)
#save the transcript
myInterestingGenes= as.character(unique_genes_upper_lip_info$transcript_id)
#subset the genesNames by the transcript IDs in my red module
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
head(geneList)
#run the topGO function for BP
GOdata <- new("topGOdata", ontology = "BP", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go <- runTest(GOdata, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment <- GenTable(GOdata, Fisher = results_go, orderBy = "Fisher", topNodes = 10000, numChar=1000)
goEnrichment$Fisher <- as.numeric(goEnrichment$Fisher)
goEnrichment <- goEnrichment[goEnrichment$Fisher < 0.05,]
goEnrichment <- goEnrichment[goEnrichment$Significant > 1,]
goEnrichment <- goEnrichment[,c("GO.ID","Term", "Annotated", "Significant", "Expected", "Fisher")]
goEnrichment
#write.table(goEnrichment, "df_TopGO_Skogsbergia_sp_DE_unique_Upper_Lip_BP.tsv",sep = "\t", quote=FALSE)
myterms =goEnrichment$GO.ID
mygenes = genesInTerm(GOdata, myterms)
#extract the transcript ids for each GO term
var=c()
for (i in 1:length(myterms))
{
myterm <- myterms[i]
mygenesforterm <- mygenes[myterm][[1]]
myfactor <- mygenesforterm %in% myInterestingGenes
mygenesforterm2 <- mygenesforterm[myfactor == TRUE]
mygenesforterm2 <- paste(mygenesforterm2, collapse=',')
var[i]=paste(myterm,"genes:",mygenesforterm2)
}
# GO enrichment Upper Lip - BP
ntop = 50
ggdata <- goEnrichment[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_GO_UL_BP <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
expand_limits(y = 1) +
geom_point(shape = 21) +
scale_size(range = c(2,7)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Biological Process')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_GO_UL_BP + labs(x = NULL)
#library (repr)
options(repr.plot.width=12, repr.plot.height=8, repr.plot.res = 500)
#run the topGO function
GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go_MF <- runTest(GOdata_MF, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_MF <- GenTable(GOdata_MF, Fisher = results_go_MF, orderBy = "Fisher", topNodes = 100, numChar=1000)
#lets graph the GO enrichment (I adapted this code from online)
goEnrichment_MF$Fisher <- as.numeric(goEnrichment_MF$Fisher)
goEnrichment_MF <- goEnrichment_MF[goEnrichment_MF$Fisher < 0.05,]
goEnrichment_MF <- goEnrichment_MF[goEnrichment_MF$Significant > 1,]
goEnrichment_MF <- goEnrichment_MF[,c("GO.ID","Term", "Annotated", "Significant", "Expected", "Fisher")]
goEnrichment_MF
#write.table(goEnrichment_MF, "df_TopGO_Skogsbergia_sp_DE_unique_Upper_Lip_MF.tsv",sep = "\t", quote=FALSE)
# GO enrichment Upper Lip - MF
ntop = 22
ggdata <- goEnrichment_MF[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_GO_UL_MF <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
expand_limits(y = 1) +
geom_point(shape = 21) +
scale_size(range = c(2,7)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Molecular Function')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_GO_UL_MF +labs(x=NULL)
#library (repr)
options(repr.plot.width=8, repr.plot.height=5)
To determine if the BCN shares similarities and is conserved in the co-expression networks of the non-bioluminescent relative, we checked whether genes in the BCN that are orthologous to genes in the Skogsbergia sp. transcriptome are conserved in their interactions in co-expression networks of Skogsbergia sp.. Co-expression networks for Skogsbergia sp. were generated with WGCNA using 15 total samples from three tissues (upper lip, gut, and compound eye), each with 5 biological replicates. WGCNA (Weighted Gene Co-expression Network Analysis) identifies clusters (modules) of highly correlated genes by constructing a network based on pairwise correlations between gene expression profiles (Langfelder and Horvath, 2008). These modules often correspond to specific biological processes or pathways, indicating that the genes within a module may be part of the same regulatory process. By analyzing the connectivity of genes within each module, WGCNA also helps identify key drivers or hub genes, providing insights into gene regulation and the biological processes they govern. The following scripts are from the WGCNA package (Langfelder and Horvath, 2008).
#grab the count matrix from the DEseq2 object
vsd_matrix <- assay(vsd_prefiltered_wgcna)
## many functions expect the matrix to be transposed
datExpr <- t(vsd_matrix)
## check rows/cols
nrow(datExpr)
ncol(datExpr)
head(datExpr)
head(vsd_matrix)
#run this to check if there are gene outliers
gsg=goodSamplesGenes(datExpr, verbose = 3)
gsg$allOK
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
# Co-expression similarity and adjacency using assigned softpower
softPower=7
adjacency = adjacency(datExpr, power = softPower)
# Topological Overlap Matrix (TOM)
# Turn adjacency into topological overlap, i.e. translate the adjacency into
# topological overlap matrix and calculate the corresponding dissimilarity:
TOM = TOMsimilarity(adjacency, TOMType = "signed", verbose = 5);
dissTOM = 1-TOM;
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
minModuleSize = 50
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
#sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
#sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.30
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
table(mergedColors)
Identify modules (network) that are significantly associated with samples. The module eigengene provides a representative measure of the gene expression patterns within a module, allowing correlation with these traits to determine the most significant associations (Langfelder and Horvath, 2008). This correlation can then be visualized in a module-to-trait heatmap to determine the most significant associations.The following scripts are from the WGCNA package (Langfelder and Horvath, 2008).
skogs_traits <- read_excel("skogs_trait.xlsx")
skogs_traits
sample_names_numeric <- c("1", "2", "3", "1", "2" ,"3", "1", "2", "3", "1", "2", "3", "1", "2", "3")
skogs_traits$sample_names_numeric <- sample_names_numeric
skogs_traits
sample_name
skogsSamples =as.character(sample_name)
traitRows =match(skogsSamples,skogs_traits$Tissue_Type)
traitRows
skogsTraits =skogs_traits[traitRows, -1]
skogsTraits
rownames(skogsTraits)=skogs_traits$Sample_Name
skogsTraits$Tissue_Type <- NULL
skogsTraits$sample_names_numeric <- NULL
rownames(skogsTraits)=skogs_traits$Sample_Name
skogsTraits
# Re-cluster samples
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(skogsTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(skogsTraits),
main = "Sample dendrogram and trait heatmap")
#Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
mergedMEs
names(mergedMEs)
moduleTraitCor = cor(mergedMEs, skogsTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
#view the graph
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 10, 6, 1) )
# Display the correlation values within a heatmap plot
labeledHeatmap(moduleTraitCor,
xLabels = names(skogsTraits),
yLabels = names(mergedMEs),
ySymbols = names(mergedMEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
textAdj = c(0.5, 0.5),
zlim = c(-1,1),
main = paste("Module-trait relationships"))
#library (repr)
options(repr.plot.width=10, repr.plot.height=10)
wgcna_adjacency <- function(datExpr, minModuleSize=50, MEDissThres = .30, deepSplit = 2) {
#compute adjacency
adjacency <- adjacency(datExpr, power = 7)
TOM <- TOMsimilarity(adjacency,TOMType="signed")
geneTree <- hclust(as.dist(1-TOM), method = "average")
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = 1-TOM, deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize);
table(dynamicMods)
dynamicColors = labels2colors(dynamicMods)
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
METree = hclust(as.dist(1-cor(MEs)), method = "average");
plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
merge <- mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 0)
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Recalculate MEs with color labels
invisible(MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes)
MEs = orderMEs(MEs0)
print(table(moduleColors))
moduleColors <- as.data.frame(moduleColors)
rownames(moduleColors) <- colnames(datExpr)
return(list(adjacency=adjacency,MEs=MEs,moduleColors=moduleColors, dynamicMods=dynamicMods, geneTree=geneTree))
}
wgcna.results <- wgcna_adjacency(datExpr, MEDissThres = 0.30)
#wgcna.results
dynamicColors<- labels2colors(wgcna.results$dynamicMods)
merge <- mergeCloseModules(datExpr, dynamicColors, cutHeight = 0.30, verbose = 0)
mergedColors<-merge$colors
mergedMEs<-merge$newMEs
vsd_matrix_df <- as.data.frame(vsd_matrix)
vsd_matrix_df$colors <- wgcna.results[["moduleColors"]]$moduleColors
head(vsd_matrix_df)
#how to make rownames the first column
vsd_matrix_df$transcript_id <-rownames(vsd_matrix_df)
transcripts_to_module <- na.omit(vsd_matrix_df) %>% dplyr::select(transcript_id, colors)
head(transcripts_to_module)
BCN_on_to_one_orthologs_skogs_networks <- transcripts_to_module %>% filter(transcript_id %in% BCN_one_to_one_orthologs_skogs$transcript_id)
BCN_on_to_one_orthologs_skogs_networks_ordered_color <- BCN_on_to_one_orthologs_skogs_networks %>% arrange(colors)
BCN_on_to_one_orthologs_skogs_networks_ordered_color
# the BCN one to one orthologs are found across 24 modules
length(unique(BCN_on_to_one_orthologs_skogs_networks_ordered_color$colors))
as.data.frame(table(BCN_on_to_one_orthologs_skogs_networks_ordered_color$colors))
To test the hypothesis of biological significance for co-expression modules, it was checked whether modules correlated with eye tissue contain genes related to eye function and development.
SubGeneNames = colnames(datExpr)
green_eye_module=as.data.frame(SubGeneNames[which(mergedColors=="green")])
#match the nodes
names(green_eye_module)[1] <- "transcript_id"
green_eye_module_trinotate <- setDT(Trinotate_lym_subset_skogs, key = 'transcript_id')[J(green_eye_module)]
#write.csv( green_eye_module, file = "green_eye_module_Skogs.csv")
head(green_eye_module_trinotate)
To determine whether the BCN is conserved, a randomization test was performed to assess if 51 BCN orthologs, selected at random from all possible one-to-one expressed orthologs, are found in fewer modules than a random set of genes
#import the BCN one-to-one ortholog sheet
BCN_one_to_one_orthologs_skogs <- read.csv("BCN_one_to_one_orthologs_skogs_ids_unlist_removep_unique.csv" ,header = TRUE, row.names=1, stringsAsFactors = FALSE)
head(BCN_one_to_one_orthologs_skogs)
colnames(BCN_one_to_one_orthologs_skogs)[1] <- "transcript_id"
#total of 79 one to one orthologs across V.tsujii and Skogsbergia sp. transcriptomes but only 51 transcripts are expressed in the networks of Skogsbergia sp.
nrow(BCN_one_to_one_orthologs_skogs)
#import ALL the one-to-one orthologs V.tsujii and Skogsbergia sp. transcriptomes.
ALL_one_to_one_orthologs_vtsujii_skogs <- read.csv("ALL_one_to_one_orthologs_vtsujii_skogs.csv" ,header = TRUE, row.names=1, stringsAsFactors = FALSE)
head(ALL_one_to_one_orthologs_vtsujii_skogs)
#now lets just subset the Skogsbergia column for the randomization test
ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS <- subset(ALL_one_to_one_orthologs_vtsujii_skogs, select ="skogs")
head(ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS)
#need to remove .p1
ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS_removep <- ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS %>% separate(skogs, c("transcript_id", "extra"), sep ="\\.p")
ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS_removep$extra <- NULL
ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS_removep_rmdup <- ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS_removep %>% distinct()
nrow(ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS_removep_rmdup)
# now subset this to include all transripts expressed in Skogsbergia sp. networks
dds_merged_table_prefiltered_wgcna_transcript_ids <- as.data.frame(rownames(dds_merged_table_prefiltered_wgcna))
colnames(dds_merged_table_prefiltered_wgcna_transcript_ids)[1] <- "transcript_id"
#now subset
ALL_one_to_one_orthologs_skogs_SUBSET_to_wgcna_expression_input <- subset(ALL_one_to_one_orthologs_vtsujii_skogs_SUBSET_SKOGS_removep_rmdup, transcript_id %in% dds_merged_table_prefiltered_wgcna_transcript_ids$transcript_id)
head(ALL_one_to_one_orthologs_skogs_SUBSET_to_wgcna_expression_input)
nrow(ALL_one_to_one_orthologs_skogs_SUBSET_to_wgcna_expression_input)
#perform the randomization test
#total of 79 one to one orthologs across V.tsujii and Skogsbergia sp. transcriptomes but only 51 transcripts are expressed in the networks of Skogsbergia sp.
ALL_one_to_one_orthologs_skogs_df_repeat10000 <- replicate(10000, sample(ALL_one_to_one_orthologs_skogs_SUBSET_to_wgcna_expression_input$transcript_id, size = 51, replace = FALSE))
head(ALL_one_to_one_orthologs_skogs_df_repeat10000)
#function to extract the number of Skogsbergia sp. modules in which one-to-one BCN orthologs are found for the randomization test
matrix_1 <- c()
matrix_2 <- c()
matrix_3 <-c()
for(i in 1:ncol(ALL_one_to_one_orthologs_skogs_df_repeat10000 )){
matrix_1[[i]] <- ALL_one_to_one_orthologs_skogs_df_repeat10000 [ ,i]
matrix_2[[i]] <- transcripts_to_module %>% filter(transcript_id %in% matrix_1[[i]])
matrix_3[[i]]<- length(unique(matrix_2[[i]]$colors))
}
print(matrix_3)
matrix_3_df <- as.data.frame(matrix_3)
histogram_BCN_one_to_one_orthologs_skogs <- ggplot(matrix_3_df, aes(x=matrix_3))+ geom_histogram(binwidth=0.5, alpha=0.4)
#determine the total number of unique modules the randomized set of Skogsbergia sp. transcripts are found in.
table(matrix_3_df)
histogram_BCN_one_to_one_orthologs_skogs_prettyversion <-
ggplot(matrix_3_df, aes(x=matrix_3)) +#+ theme_classic()+
labs(x="Skogsbergia sp Modules", y = "Count") +
geom_histogram(color ="darkblue", fill="lightblue", size=1, bins=16)+ scale_x_continuous(breaks = seq(min(matrix_3_df$matrix_3), max(matrix_3_df$matrix_3)))+
theme(text=element_text(family = "Arial")) +
theme(axis.title.x = element_text(size = 20)) +
theme(axis.title.y = element_text(size = 20)) +
theme(axis.text.x = element_text(size = 15)) +
theme(axis.text.y = element_text(size = 15)) +
geom_vline(data=matrix_3_df, aes(xintercept=24, color="black"),
linetype="dashed", size =1)+labs(color = "") +
theme(legend.position = "none")
histogram_BCN_one_to_one_orthologs_skogs_prettyversion
P. Langfelder, S. Horvath, WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics 9, 559 (2008)
M. I. Love, W. Huber, S. Anders, Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 15, 550 (2014)
Alexa A, Rahnenfuhrer J, topGO: Enrichment Analysis for Gene Ontology (2024)
Shannon, Paul et al., Cytoscape: a software environment for integrated models of biomolecular interaction networks. Genome research vol. 13,11 (2003)